This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
suppressMessages(library(ISLR))
suppressMessages(library(caret))
suppressMessages(library(doMC))
library(plotly)
registerDoMC(cores=4)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

Getting and proccesing the data

library(stringr)
myData = read.csv('/home/harpo/Dropbox/ongoing-work/git-repos/labeling-datasets/phd/datasets/data_all_result.txt', stringsAsFactors = F, sep = ' ')

#Create data backup
myData.bkup <- myData
#Create new column: length of model, and number of periodicity, duration and size characteristic in the model.
myData = myData %>% mutate(letter_count = nchar(State))
#Periodicity
myData = myData %>% mutate(strong_p = str_count(State,'[a-i]'))
myData = myData %>% mutate(weak_p = str_count(State,'[A-I]'))
myData = myData %>% mutate(weak_np = str_count(State,'[r-z]'))
myData = myData %>% mutate(strong_np = str_count(State,'[R-Z]'))
#Duration
myData = myData %>% mutate(duration_s = str_count(State,'(a|A|r|R|1|d|D|u|U|4|g|G|x|X|7)'))#c('a','A','r','R','1','d','D','u','U','4','g','G','x','X','7')))
myData = myData %>% mutate(duration_m = str_count(State,'(b|B|s|S|2|e|E|v|V|5|h|H|y|Y|8)'))#c('b','B','s','S','2','e','E','v','V','5','h','H','y','Y','8')))
myData = myData %>% mutate(duration_l = str_count(State,'(c|C|t|T|3|f|F|w|W|6|i|I|z|Z|9)'))#c('c','C','t','T','3','f','F','w','W','6','i','I','z','Z','9')))
#Size
myData = myData %>% mutate(size_s = str_count(State,'[a-c]') + str_count(State,'[A-C]') + str_count(State,'[r-t]') + str_count(State,'[R-T]') + str_count(State,'[1-3]'))
myData = myData %>% mutate(size_m = str_count(State,'[d-f]') + str_count(State,'[D-F]') + str_count(State,'[u-w]') + str_count(State,'[U-W]') + str_count(State,'[4-6]'))
myData = myData %>% mutate(size_l = str_count(State,'[g-i]') + str_count(State,'[G-I]') + str_count(State,'[x-z]') + str_count(State,'[X-Z]') + str_count(State,'[7-9]'))

#Remove from LabelName unnecessary characters (ej: V42, -17)
myData <- myData %>% mutate(LabelName = gsub('V[0-9]+-','',LabelName))
myData <- myData %>% mutate(LabelName = gsub('-[0-9]+','',LabelName))
myData <- myData %>% mutate(LabelName = gsub('CC[0-9]+-','CC-',LabelName))

#Keep only connection with more than 3 symbols
myData <- myData %>% filter(letter_count > 3)

#Periodicity %
myData <- myData %>% mutate(strong_p = (strong_p / letter_count))
myData <- myData %>% mutate(weak_p = (weak_p / letter_count))
myData <- myData %>% mutate(strong_np = (strong_np / letter_count))
myData <- myData %>% mutate(weak_np = (weak_np / letter_count))
#Duration %
myData <- myData %>% mutate(duration_s = (duration_s / letter_count))
myData <- myData %>% mutate(duration_m = (duration_m / letter_count))
myData <- myData %>% mutate(duration_l = (duration_l / letter_count))
#Size %
myData <- myData %>% mutate(size_s = (size_s / letter_count))
myData <- myData %>% mutate(size_m = (size_m / letter_count))
myData <- myData %>% mutate(size_l = (size_l / letter_count))

#head(myData)
myData[1:20,]

#Making feature vectors
feature_vectors = myData[,c('strong_p','weak_p','weak_np','strong_np','duration_s','duration_m','duration_l','size_s','size_m','size_l','letter_count','Label','LabelName')]
names(feature_vectors) = c("sp","wp","wnp","snp","ds","dm","dl","ss","sm","sl","length","class","subclass")
feature_vectors$class = factor(feature_vectors$class)
feature_vectors$subclass = factor(feature_vectors$subclass)

Create training set and testset

set.seed(300)
trainIndex <- createDataPartition(feature_vectors$class, p=0.80, list=FALSE)
data_train <- feature_vectors[ trainIndex,]
data_test <- feature_vectors[-trainIndex,]
#data_train = data_train %>% filter(length>5)
train <- upSample(x = data_train,  y = data_train$class, yname="class")
#train <- upSample(x = train,  y = train$subclass, yname="class")
training <- train[,-c(11,12)]
testing <- data_test[,-c(11)]
training
ctrl_fast <- trainControl(method="cv", 
                     repeats=2,
                     number=10, 
                     summaryFunction=twoClassSummary,
                     verboseIter=T,
                     classProbs=TRUE,
                     allowParallel = TRUE)  
ctrl <- trainControl(method="repeatedcv",repeats = 3) #,classProbs=TRUE,summaryFunction = twoClassSummary)

Random Forest Classificator

  # Random Forest
rfFit <- train(class ~ .,
               data = training,
               metric="ROC",
               method = "rf",
               trControl = ctrl_fast)

rfFit
rfFit$finalModel
predsrfprobs=predict(rfFit,testing,type='prob')
predsrf=ifelse(predsrfprobs$Botnet >=0.9,'Botnet','Normal')
confusionMatrix(predsrf,testing$class)
library(ggplot2)
library(plotROC)
selectedIndices <- rfFit$pred$mtry == 2
ggplot(cbind(predsrfprobs,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

cbind(predsrfprobs,class=testing$class)

KNN

#Checking distibution in origanl data and partitioned data
prop.table(table(training$class)) * 100

Botnet Normal 
    50     50 
prop.table(table(testing$class)) * 100

  Botnet   Normal 
69.25986 30.74014 
prop.table(table(feature_vectors$class)) * 100

 Botnet  Normal 
69.2449 30.7551 
trainX <- training[,names(training) != "class"]
preProcValues <- preProcess(x = trainX,method = c("center", "scale"))
preProcValues
Created from 10256 samples and 11 variables

Pre-processing:
  - centered (10)
  - ignored (1)
  - scaled (10)
knnFit <- train(class ~ ., data = training, method = "knn", trControl = ctrl_fast, preProcess = c("center","scale"), tuneLength = 20)
The metric "Accuracy" was not in the result set. ROC will be used instead.
+ Fold01: k= 5 
- Fold01: k= 5 
+ Fold01: k= 7 
- Fold01: k= 7 
+ Fold01: k= 9 
- Fold01: k= 9 
+ Fold01: k=11 
- Fold01: k=11 
+ Fold01: k=13 
- Fold01: k=13 
+ Fold01: k=15 
- Fold01: k=15 
+ Fold01: k=17 
- Fold01: k=17 
+ Fold01: k=19 
- Fold01: k=19 
+ Fold01: k=21 
- Fold01: k=21 
+ Fold01: k=23 
- Fold01: k=23 
+ Fold01: k=25 
- Fold01: k=25 
+ Fold01: k=27 
- Fold01: k=27 
+ Fold01: k=29 
- Fold01: k=29 
+ Fold01: k=31 
- Fold01: k=31 
+ Fold01: k=33 
- Fold01: k=33 
+ Fold01: k=35 
- Fold01: k=35 
+ Fold01: k=37 
- Fold01: k=37 
+ Fold01: k=39 
- Fold01: k=39 
+ Fold01: k=41 
- Fold01: k=41 
+ Fold01: k=43 
- Fold01: k=43 
+ Fold02: k= 5 
- Fold02: k= 5 
+ Fold02: k= 7 
- Fold02: k= 7 
+ Fold02: k= 9 
- Fold02: k= 9 
+ Fold02: k=11 
- Fold02: k=11 
+ Fold02: k=13 
- Fold02: k=13 
+ Fold02: k=15 
- Fold02: k=15 
+ Fold02: k=17 
- Fold02: k=17 
+ Fold02: k=19 
- Fold02: k=19 
+ Fold02: k=21 
- Fold02: k=21 
+ Fold02: k=23 
- Fold02: k=23 
+ Fold02: k=25 
- Fold02: k=25 
+ Fold02: k=27 
- Fold02: k=27 
+ Fold02: k=29 
- Fold02: k=29 
+ Fold02: k=31 
- Fold02: k=31 
+ Fold02: k=33 
- Fold02: k=33 
+ Fold02: k=35 
- Fold02: k=35 
+ Fold02: k=37 
- Fold02: k=37 
+ Fold02: k=39 
- Fold02: k=39 
+ Fold02: k=41 
- Fold02: k=41 
+ Fold02: k=43 
- Fold02: k=43 
+ Fold03: k= 5 
- Fold03: k= 5 
+ Fold03: k= 7 
- Fold03: k= 7 
+ Fold03: k= 9 
- Fold03: k= 9 
+ Fold03: k=11 
- Fold03: k=11 
+ Fold03: k=13 
- Fold03: k=13 
+ Fold03: k=15 
- Fold03: k=15 
+ Fold03: k=17 
- Fold03: k=17 
+ Fold03: k=19 
- Fold03: k=19 
+ Fold03: k=21 
- Fold03: k=21 
+ Fold03: k=23 
- Fold03: k=23 
+ Fold03: k=25 
- Fold03: k=25 
+ Fold03: k=27 
- Fold03: k=27 
+ Fold03: k=29 
- Fold03: k=29 
+ Fold03: k=31 
- Fold03: k=31 
+ Fold03: k=33 
- Fold03: k=33 
+ Fold03: k=35 
- Fold03: k=35 
+ Fold03: k=37 
- Fold03: k=37 
+ Fold03: k=39 
- Fold03: k=39 
+ Fold03: k=41 
- Fold03: k=41 
+ Fold03: k=43 
- Fold03: k=43 
+ Fold04: k= 5 
- Fold04: k= 5 
+ Fold04: k= 7 
- Fold04: k= 7 
+ Fold04: k= 9 
- Fold04: k= 9 
+ Fold04: k=11 
- Fold04: k=11 
+ Fold04: k=13 
- Fold04: k=13 
+ Fold04: k=15 
- Fold04: k=15 
+ Fold04: k=17 
- Fold04: k=17 
+ Fold04: k=19 
- Fold04: k=19 
+ Fold04: k=21 
- Fold04: k=21 
+ Fold04: k=23 
- Fold04: k=23 
+ Fold04: k=25 
- Fold04: k=25 
+ Fold04: k=27 
- Fold04: k=27 
+ Fold04: k=29 
- Fold04: k=29 
+ Fold04: k=31 
- Fold04: k=31 
+ Fold04: k=33 
- Fold04: k=33 
+ Fold04: k=35 
- Fold04: k=35 
+ Fold04: k=37 
- Fold04: k=37 
+ Fold04: k=39 
- Fold04: k=39 
+ Fold04: k=41 
- Fold04: k=41 
+ Fold04: k=43 
- Fold04: k=43 
+ Fold05: k= 5 
- Fold05: k= 5 
+ Fold05: k= 7 
- Fold05: k= 7 
+ Fold05: k= 9 
- Fold05: k= 9 
+ Fold05: k=11 
- Fold05: k=11 
+ Fold05: k=13 
- Fold05: k=13 
+ Fold05: k=15 
- Fold05: k=15 
+ Fold05: k=17 
- Fold05: k=17 
+ Fold05: k=19 
- Fold05: k=19 
+ Fold05: k=21 
- Fold05: k=21 
+ Fold05: k=23 
- Fold05: k=23 
+ Fold05: k=25 
- Fold05: k=25 
+ Fold05: k=27 
- Fold05: k=27 
+ Fold05: k=29 
- Fold05: k=29 
+ Fold05: k=31 
- Fold05: k=31 
+ Fold05: k=33 
- Fold05: k=33 
+ Fold05: k=35 
- Fold05: k=35 
+ Fold05: k=37 
- Fold05: k=37 
+ Fold05: k=39 
- Fold05: k=39 
+ Fold05: k=41 
- Fold05: k=41 
+ Fold05: k=43 
- Fold05: k=43 
+ Fold06: k= 5 
These variables have zero variances: subclassFrom-Botnet-TCP-CC-HTTP-Custom-Port-Not-Encrypted-Non-Periodic
- Fold06: k= 5 
+ Fold06: k= 7 
These variables have zero variances: subclassFrom-Botnet-TCP-CC-HTTP-Custom-Port-Not-Encrypted-Non-Periodic
- Fold06: k= 7 
+ Fold06: k= 9 
These variables have zero variances: subclassFrom-Botnet-TCP-CC-HTTP-Custom-Port-Not-Encrypted-Non-Periodic
- Fold06: k= 9 
+ Fold06: k=11 
These variables have zero variances: subclassFrom-Botnet-TCP-CC-HTTP-Custom-Port-Not-Encrypted-Non-Periodic
- Fold06: k=11 
+ Fold06: k=13 
These variables have zero variances: subclassFrom-Botnet-TCP-CC-HTTP-Custom-Port-Not-Encrypted-Non-Periodic
- Fold06: k=13 
+ Fold06: k=15 
These variables have zero variances: subclassFrom-Botnet-TCP-CC-HTTP-Custom-Port-Not-Encrypted-Non-Periodic
#Plotting yields Number of Neighbours Vs accuracy (based on repeated cross validation)
plot(knnFit)
knnPredict <- predict(knnFit,newdata = testing )
#Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(knnPredict, testing$class )
mean(knnPredict == testing$class)
library(pROC)
knnPredict <- predict(knnFit,newdata = testing , type="prob")
knnROC <- roc(testing$class,knnPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))
knnROC
ggplot(cbind(knnPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

#plot(knnROC, type="S", print.thres= 0.5)

Logistic Regression

logicRFit <- train(class ~ ., method='glm', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))
#logicRFit <- train(class ~ sp*wp*wnp*snp*ds*dm*dl*ss*sm*sl, method='glm', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))
#logicRFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl, method='glm', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))

#summary(logicRFit)
#Output of Logistic Regression fit
logicRFit

logicRPredict <- predict(logicRFit, newdata = testing )

confusionMatrix(logicRPredict, testing$class)
logicRPredict <- predict(logicRFit, newdata = testing, type="prob")
logicROC <- roc(testing$class,logicRPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))

ggplot(cbind(logicRPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

#logicROC

Naive Bayes

naiveBayesFit <- train(class ~ ., method='nb', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training)
naiveBayesFit
naiveBayesPredict <- predict(naiveBayesFit, newdata = testing)

confusionMatrix(naiveBayesPredict, testing$class)
naiveBayesPredict <- predict(naiveBayesFit, newdata = testing, type = 'prob')
naiveBayesROC <- roc(testing$class,naiveBayesPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))
naiveBayesROC

ggplot(cbind(naiveBayesPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

#plot(naiveBayesROC, type="S", print.thres= 0.5)

Suport Vector Machine

svmFit <- train(class ~ ., method='svmLinear', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))
svmFit
svmPredict <- predict(svmFit, newdata = testing)
confusionMatrix(svmPredict, testing$class)
svmPredict <- predict(svmFit, newdata = testing, type = "prob")

svmROC <- roc(testing$class,svmPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))
svmROC

ggplot(cbind(svmPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

Comparing Models

resamps <- resamples(list(rf = rfFit, lr = logicRFit, nv = naiveBayesFit, svm = svmFit))
Error in resamples(list(rf = rfFit, lr = logicRFit, nv = naiveBayesFit,  : 
  objeto 'rfFit' no encontrado

Making probabilistic table.

#load("./botnet_prob_results.Rda")
library(grid)
library(gridExtra)
#botnet_prob_result %>% group_by(subclass) %>% summarise(n=n(),mean=mean(KNN),sd=sd(KNN)) %>% arrange(desc(n))
#botnet_prob_result %>% group_by(subclass) %>% summarise(n=n(),mean=mean(NaiveBayes),sd=sd(NaiveBayes)) %>% arrange(desc(n))%>% top_n(10)
botnet_10_top<-botnet_prob_result %>% filter(TrueClass=="Normal") %>% group_by(subclass) %>% summarise(n=n()) %>% arrange(desc(n))%>% top_n(20)
Selecting by n
botnet_10_top<-inner_join(botnet_10_top,botnet_prob_result,by="subclass")
knn_plot<-bwplot(subclass~KNN,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
rf_plot<-bwplot(subclass~RamdomForest,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
rl_plot<-bwplot(subclass~LogisticRegression,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
svm_plot<-bwplot(subclass~SVM,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
pl = list(knn_plot, rf_plot,rl_plot,svm_plot)
# do.call(grid.arrange, c(pl, nrow=1))
do.call(grid.arrange, c(lapply(pl, update), list(nrow=1)))

some heatmaps

library(scales)
knn_m<-matrix(botnet_prob_result[1:1830,]$KNN,ncol=30,nrow=61)
svm_m<-matrix(botnet_prob_result[1:1830,]$SVM,ncol=30,nrow=61)
lr_m<-matrix(botnet_prob_result[1:1830,]$LogisticRegression,ncol=30,nrow=61)
nb_m<-matrix(botnet_prob_result[1:1830,]$NaiveBayes,ncol=30,nrow=61)
rf_m<-matrix(botnet_prob_result[1:1830,]$RamdomForest,ncol=30,nrow=61)
mdf<-as.data.frame(knn_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h1<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)
mdf<-as.data.frame(svm_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h2<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)
mdf<-as.data.frame(rf_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h3<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)
mdf<-as.data.frame(nb_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h4<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)
grid.arrange(h1,h2,h3,h4,ncol=2,nrow=2)

difference heatmaps

mdf<-as.data.frame(rf_m - knn_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
mdf<-cbind(mdf,subclass=(botnet_prob_result[1:1830,]$subclass))
diff<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value,text=subclass),
            colour = "white") +
  scale_fill_gradientn(colours=c("red","white","green"),
           values  = rescale(c(min(mdf$value), 0.05, max(mdf$value))))+
           guides(fill=FALSE)+theme_bw()  
Ignoring unknown aesthetics: text
ggplotly(diff)

d3heatmap(rf_m - knn_m,colors = "Blues",cellnote=matrix(botnet_prob_result[1:1830,]$subclass,ncol=30,nrow=61))

Subclass probability analisys (Attempt I)

KNN vs RF.

a=mdf %>% filter(value< -0.09) %>% group_by(subclass) %>% summarise(totless009=n()) %>% arrange(desc(totless009))
b=mdf  %>% group_by(subclass) %>% summarise(total=n()) %>% arrange(desc(total))
subclass_percent_diff<-inner_join(a,b,by="subclass") %>% mutate(percent=totless009/total) %>% arrange(desc(total)) 
subclass_percent_diff
subclass_detections<-botnet_prob_result %>% mutate(detected_rf=ifelse(RamdomForest>0.5,"Botnet","Normal"),
                              detected_knn=ifelse(KNN>0.5,"Botnet","Normal")) %>% 
                              mutate(correct_rf=ifelse(detected_rf==TrueClass,1,0))%>%
                              mutate(correct_knn=ifelse(detected_knn==TrueClass,1,0)) %>% 
  group_by(subclass) %>% summarise(total_correct_rf=sum(correct_rf),total_correct_knn=sum(correct_knn))
inner_join(subclass_detections,subclass_percent_diff,by="subclass")
mdf<-cbind(mdf,rf=botnet_prob_result$RamdomForest[1:1830],knn=botnet_prob_result$KNN[1:1830],trueclass=botnet_prob_result$TrueClass[1:1830])
botnet_prob_result
mdf %>% filter(value< -0.09) %>% filter(trueclass=="Botnet")
mdf %>% filter(value> 0.09) %>% filter(trueclass=="Normal")

Clustering and PCA

kmeans_mod<-kmeans(testing[,1:10],centers = 10)
testing_cluster<-cbind(testing,cluster=kmeans_mod$cluster)
pca<-prcomp(testing[,c(-11,-12)], center = TRUE, scale. = TRUE) 
pca_testing<-data.frame(pca$x,class=testing_cluster$class,
                              subclass=testing_cluster$subclass,
                               cluster=testing_cluster$cluster
                               )
g<-ggplot(pca_testing,aes(x=PC1,y=PC2))+
  geom_jitter(aes(color=as.factor(subclass),text=cluster,shape=class))+
  #geom_point(aes(shape=asignacion),size=3)+
  ylab("PC1")+xlab("PC2")+
  theme_classic()+
#scale_shape_manual(values=c(8,6))+
   guides(color=FALSE,alpha=FALSE)
Ignoring unknown aesthetics: text
ggplotly(g)
#Normal probabilistic table
normal_prob_result = data.frame(testing$class, predsrfprobs$Normal, knnPredict$Normal, logicRPredict$Normal, naiveBayesPredict$Normal ,svmPredict$Normal)
names(normal_prob_result) = c('True Class','Ramdom Forest','KNN','Logistic Regression', 'Naive Bayes', 'Suport VM')
normal_prob_result
---
title: "Connection Classification"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

```{r}
suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
suppressMessages(library(ISLR))
suppressMessages(library(caret))
suppressMessages(library(doMC))
library(plotly)
registerDoMC(cores=4)

```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

### Getting and proccesing the data
```{r}
library(stringr)
myData = read.csv('/home/harpo/Dropbox/ongoing-work/git-repos/labeling-datasets/phd/datasets/data_all_result.txt', stringsAsFactors = F, sep = ' ')

#Create data backup
myData.bkup <- myData
#Create new column: length of model, and number of periodicity, duration and size characteristic in the model.
myData = myData %>% mutate(letter_count = nchar(State))
#Periodicity
myData = myData %>% mutate(strong_p = str_count(State,'[a-i]'))
myData = myData %>% mutate(weak_p = str_count(State,'[A-I]'))
myData = myData %>% mutate(weak_np = str_count(State,'[r-z]'))
myData = myData %>% mutate(strong_np = str_count(State,'[R-Z]'))
#Duration
myData = myData %>% mutate(duration_s = str_count(State,'(a|A|r|R|1|d|D|u|U|4|g|G|x|X|7)'))#c('a','A','r','R','1','d','D','u','U','4','g','G','x','X','7')))
myData = myData %>% mutate(duration_m = str_count(State,'(b|B|s|S|2|e|E|v|V|5|h|H|y|Y|8)'))#c('b','B','s','S','2','e','E','v','V','5','h','H','y','Y','8')))
myData = myData %>% mutate(duration_l = str_count(State,'(c|C|t|T|3|f|F|w|W|6|i|I|z|Z|9)'))#c('c','C','t','T','3','f','F','w','W','6','i','I','z','Z','9')))
#Size
myData = myData %>% mutate(size_s = str_count(State,'[a-c]') + str_count(State,'[A-C]') + str_count(State,'[r-t]') + str_count(State,'[R-T]') + str_count(State,'[1-3]'))
myData = myData %>% mutate(size_m = str_count(State,'[d-f]') + str_count(State,'[D-F]') + str_count(State,'[u-w]') + str_count(State,'[U-W]') + str_count(State,'[4-6]'))
myData = myData %>% mutate(size_l = str_count(State,'[g-i]') + str_count(State,'[G-I]') + str_count(State,'[x-z]') + str_count(State,'[X-Z]') + str_count(State,'[7-9]'))

#Remove from LabelName unnecessary characters (ej: V42, -17)
myData <- myData %>% mutate(LabelName = gsub('V[0-9]+-','',LabelName))
myData <- myData %>% mutate(LabelName = gsub('-[0-9]+','',LabelName))
myData <- myData %>% mutate(LabelName = gsub('CC[0-9]+-','CC-',LabelName))

#Keep only connection with more than 3 symbols
myData <- myData %>% filter(letter_count > 3)

#Periodicity %
myData <- myData %>% mutate(strong_p = (strong_p / letter_count))
myData <- myData %>% mutate(weak_p = (weak_p / letter_count))
myData <- myData %>% mutate(strong_np = (strong_np / letter_count))
myData <- myData %>% mutate(weak_np = (weak_np / letter_count))
#Duration %
myData <- myData %>% mutate(duration_s = (duration_s / letter_count))
myData <- myData %>% mutate(duration_m = (duration_m / letter_count))
myData <- myData %>% mutate(duration_l = (duration_l / letter_count))
#Size %
myData <- myData %>% mutate(size_s = (size_s / letter_count))
myData <- myData %>% mutate(size_m = (size_m / letter_count))
myData <- myData %>% mutate(size_l = (size_l / letter_count))

#head(myData)
myData[1:20,]

#Making feature vectors
feature_vectors = myData[,c('strong_p','weak_p','weak_np','strong_np','duration_s','duration_m','duration_l','size_s','size_m','size_l','letter_count','Label','LabelName')]
names(feature_vectors) = c("sp","wp","wnp","snp","ds","dm","dl","ss","sm","sl","length","class","subclass")
feature_vectors$class = factor(feature_vectors$class)
feature_vectors$subclass = factor(feature_vectors$subclass)
```

### Create training set and testset
```{r}
set.seed(300)
trainIndex <- createDataPartition(feature_vectors$class, p=0.80, list=FALSE)
data_train <- feature_vectors[ trainIndex,]
data_test <- feature_vectors[-trainIndex,]

#data_train = data_train %>% filter(length>5)
train <- upSample(x = data_train,  y = data_train$class, yname="class")
#train <- upSample(x = train,  y = train$subclass, yname="class")
training <- train[,-c(11,12)]
testing <- data_test[,-c(11)]
training
train
ctrl_fast <- trainControl(method="cv", 
                     repeats=2,
                     number=10, 
                     summaryFunction=twoClassSummary,
                     verboseIter=T,
                     classProbs=TRUE,
                     allowParallel = TRUE)  
ctrl <- trainControl(method="repeatedcv",repeats = 3) #,classProbs=TRUE,summaryFunction = twoClassSummary)
```


### Random Forest Classificator
```{r}
  # Random Forest
rfFit <- train(class ~ .,
               data = training,
               metric="ROC",
               method = "rf",
               trControl = ctrl_fast)

rfFit
rfFit$finalModel
```

```{r}
predsrfprobs=predict(rfFit,testing,type='prob')
predsrf=ifelse(predsrfprobs$Botnet >=0.9,'Botnet','Normal')
confusionMatrix(predsrf,testing$class)
```

```{r}
library(ggplot2)
library(plotROC)
selectedIndices <- rfFit$pred$mtry == 2
ggplot(cbind(predsrfprobs,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

cbind(predsrfprobs,class=testing$class)
```


### KNN
```{r}
#Checking distibution in origanl data and partitioned data
prop.table(table(training$class)) * 100
prop.table(table(testing$class)) * 100
prop.table(table(feature_vectors$class)) * 100

trainX <- training[,names(training) != "class"]
preProcValues <- preProcess(x = trainX,method = c("center", "scale"))
preProcValues
```
```{r}
knnFit <- train(class ~ ., data = training, method = "knn", trControl = ctrl_fast, preProcess = c("center","scale"), tuneLength = 20)

#Output of kNN fit
knnFit
```
```{r}
#Plotting yields Number of Neighbours Vs accuracy (based on repeated cross validation)
plot(knnFit)
```
```{r}
knnPredict <- predict(knnFit,newdata = testing )
#Get the confusion matrix to see accuracy value and other parameter values
confusionMatrix(knnPredict, testing$class )
```
```{r}
mean(knnPredict == testing$class)
```
```{r}
library(pROC)
knnPredict <- predict(knnFit,newdata = testing , type="prob")
knnROC <- roc(testing$class,knnPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))
knnROC
```

```{r}
ggplot(cbind(knnPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

#plot(knnROC, type="S", print.thres= 0.5)
```
### Logistic Regression

```{r}
logicRFit <- train(class ~ ., method='glm', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))
#logicRFit <- train(class ~ sp*wp*wnp*snp*ds*dm*dl*ss*sm*sl, method='glm', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))
#logicRFit <- train(class ~ sp+wp+wnp+snp+ds+dm+dl+ss+sm+sl, method='glm', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))

#summary(logicRFit)
#Output of Logistic Regression fit
logicRFit
```

```{r}

logicRPredict <- predict(logicRFit, newdata = testing )

confusionMatrix(logicRPredict, testing$class)
```
```{r}
logicRPredict <- predict(logicRFit, newdata = testing, type="prob")
logicROC <- roc(testing$class,logicRPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))

ggplot(cbind(logicRPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

#logicROC
```

### Naive Bayes
```{r}
naiveBayesFit <- train(class ~ ., method='nb', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training)
naiveBayesFit
```

```{r}
naiveBayesPredict <- predict(naiveBayesFit, newdata = testing)

confusionMatrix(naiveBayesPredict, testing$class)
```
```{r}
naiveBayesPredict <- predict(naiveBayesFit, newdata = testing, type = 'prob')
naiveBayesROC <- roc(testing$class,naiveBayesPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))
naiveBayesROC

ggplot(cbind(naiveBayesPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

#plot(naiveBayesROC, type="S", print.thres= 0.5)
```


### Suport Vector Machine
```{r}
svmFit <- train(class ~ ., method='svmLinear', trControl = ctrl_fast,preProcess=c('scale', 'center'), data=training, family=binomial(link='logit'))
svmFit
```

```{r}
svmPredict <- predict(svmFit, newdata = testing)
confusionMatrix(svmPredict, testing$class)
```

```{r}
svmPredict <- predict(svmFit, newdata = testing, type = "prob")

svmROC <- roc(testing$class,svmPredict[,"Botnet"], levels = c('Normal','Botnet'))#rev(testing$class))
svmROC

ggplot(cbind(svmPredict,class=testing$class), 
       aes(m = Botnet, d = factor(class, labels=c("Normal","Botnet"),levels = c("Normal", "Botnet")))) + 
    geom_roc(hjust = -0.4, vjust = 1.5,colour='orange') + 
  theme_bw()

```


### Comparing Models
```{r}
resamps <- resamples(list(rf = rfFit, lr = logicRFit, nv = naiveBayesFit, svm = svmFit))
summary(resamps)
bwplot(resamps)
diffs <- diff(resamps)
summary(diffs)
values=resamps$values
values
names(values)[2]<-"rfSens"

ggplot(values)+
  geom_boxplot(aes(y=rfSens,x=1))

```
### Making probabilistic table.
```{r}
# Botnet probabilistic table
botnet_prob_result = data.frame(testing$class, predsrfprobs$Botnet, knnPredict$Botnet, logicRPredict$Botnet, naiveBayesPredict$Botnet ,svmPredict$Botnet)
botnet_prob_result = botnet_prob_result %>% mutate(subclass = data_test$subclass)
names(botnet_prob_result) = c('TrueClass','RamdomForest','KNN','LogisticRegression', 'NaiveBayes', 'SVM','subclass')
botnet_prob_result
```


```{r}
#load("./botnet_prob_results.Rda")
library(grid)
library(gridExtra)
#botnet_prob_result %>% group_by(subclass) %>% summarise(n=n(),mean=mean(KNN),sd=sd(KNN)) %>% arrange(desc(n))
#botnet_prob_result %>% group_by(subclass) %>% summarise(n=n(),mean=mean(NaiveBayes),sd=sd(NaiveBayes)) %>% arrange(desc(n))%>% top_n(10)

botnet_10_top<-botnet_prob_result %>% filter(TrueClass=="Normal") %>% group_by(subclass) %>% summarise(n=n()) %>% arrange(desc(n))%>% top_n(20)
botnet_10_top<-inner_join(botnet_10_top,botnet_prob_result,by="subclass")

knn_plot<-bwplot(subclass~KNN,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
rf_plot<-bwplot(subclass~RamdomForest,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
rl_plot<-bwplot(subclass~LogisticRegression,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))
svm_plot<-bwplot(subclass~SVM,data=botnet_10_top,do.out = FALSE,scales=list(y=list(draw=FALSE)))


pl = list(knn_plot, rf_plot,rl_plot,svm_plot)
# do.call(grid.arrange, c(pl, nrow=1))
do.call(grid.arrange, c(lapply(pl, update), list(nrow=1)))

```

#### some heatmaps
```{r heatmaps}
library(scales)
knn_m<-matrix(botnet_prob_result[1:1830,]$KNN,ncol=30,nrow=61)
svm_m<-matrix(botnet_prob_result[1:1830,]$SVM,ncol=30,nrow=61)
lr_m<-matrix(botnet_prob_result[1:1830,]$LogisticRegression,ncol=30,nrow=61)
nb_m<-matrix(botnet_prob_result[1:1830,]$NaiveBayes,ncol=30,nrow=61)
rf_m<-matrix(botnet_prob_result[1:1830,]$RamdomForest,ncol=30,nrow=61)

mdf<-as.data.frame(knn_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h1<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)


mdf<-as.data.frame(svm_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h2<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)


mdf<-as.data.frame(rf_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h3<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)


mdf<-as.data.frame(nb_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
h4<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value),
            colour = "white") +
  scale_fill_gradient(low = "white",
    high = "orange")+ylab("")+xlab("")+
  guides(fill=FALSE)
grid.arrange(h1,h2,h3,h4,ncol=2,nrow=2)
```

#### difference heatmaps
```{r heatmap diff}
mdf<-as.data.frame(rf_m - knn_m)
mdf<-cbind(mdf,id=seq(1:61))
mdf<-reshape2::melt(mdf,id.vars=c("id"))
mdf<-cbind(mdf,subclass=(botnet_prob_result[1:1830,]$subclass))

diff<-ggplot(mdf)+
  geom_tile(aes(x=id,y=variable,fill=value,text=subclass),
            colour = "white") +
  scale_fill_gradientn(colours=c("red","white","green"),
           values  = rescale(c(min(mdf$value), 0.05, max(mdf$value))))+
           guides(fill=FALSE)+theme_bw()  
ggplotly(diff)
d3heatmap(rf_m - knn_m,colors = "Blues",cellnote=matrix(botnet_prob_result[1:1830,]$subclass,ncol=30,nrow=61))
```

### Subclass probability analisys (Attempt I)
KNN vs RF.
```{r}
a=mdf %>% filter(value< -0.09) %>% group_by(subclass) %>% summarise(totless009=n()) %>% arrange(desc(totless009))
b=mdf  %>% group_by(subclass) %>% summarise(total=n()) %>% arrange(desc(total))

subclass_percent_diff<-inner_join(a,b,by="subclass") %>% mutate(percent=totless009/total) %>% arrange(desc(total)) 
subclass_percent_diff

subclass_detections<-botnet_prob_result %>% mutate(detected_rf=ifelse(RamdomForest>0.5,"Botnet","Normal"),
                              detected_knn=ifelse(KNN>0.5,"Botnet","Normal")) %>% 
                              mutate(correct_rf=ifelse(detected_rf==TrueClass,1,0))%>%
                              mutate(correct_knn=ifelse(detected_knn==TrueClass,1,0)) %>% 
  group_by(subclass) %>% summarise(total_correct_rf=sum(correct_rf),total_correct_knn=sum(correct_knn))


inner_join(subclass_detections,subclass_percent_diff,by="subclass")

mdf<-cbind(mdf,rf=botnet_prob_result$RamdomForest[1:1830],knn=botnet_prob_result$KNN[1:1830],trueclass=botnet_prob_result$TrueClass[1:1830])
botnet_prob_result
mdf %>% filter(value< -0.09) %>% filter(trueclass=="Botnet")
mdf %>% filter(value> 0.09) %>% filter(trueclass=="Normal")


```
### Clustering and PCA
```{r clustering}

kmeans_mod<-kmeans(testing[,1:10],centers = 10)
testing_cluster<-cbind(testing,cluster=kmeans_mod$cluster)
pca<-prcomp(testing[,c(-11,-12)], center = TRUE, scale. = TRUE) 
pca_testing<-data.frame(pca$x,class=testing_cluster$class,
                              subclass=testing_cluster$subclass,
                               cluster=testing_cluster$cluster
                               )
g<-ggplot(pca_testing,aes(x=PC1,y=PC2))+
  geom_jitter(aes(color=as.factor(subclass),text=cluster,shape=class))+
  #geom_point(aes(shape=asignacion),size=3)+
  ylab("PC1")+xlab("PC2")+
  theme_classic()+
#scale_shape_manual(values=c(8,6))+
   guides(color=FALSE,alpha=FALSE)
ggplotly(g)

```

```{r}
#Normal probabilistic table
normal_prob_result = data.frame(testing$class, predsrfprobs$Normal, knnPredict$Normal, logicRPredict$Normal, naiveBayesPredict$Normal ,svmPredict$Normal)
names(normal_prob_result) = c('True Class','Ramdom Forest','KNN','Logistic Regression', 'Naive Bayes', 'Suport VM')
normal_prob_result
```

